This is a Rmd file analyzing our raw count data by the glmmSeq package as described in the vignette and manual.
sessionInfo() #provides list of loaded packages and version of R. I still have version 4.1 for now.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.34 R6_2.5.1 fastmap_1.1.1 xfun_0.42
## [5] cachem_1.0.8 knitr_1.45 htmltools_0.5.7 rmarkdown_2.25
## [9] lifecycle_1.0.4 cli_3.6.2 sass_0.4.8 jquerylib_0.1.4
## [13] compiler_4.3.2 rstudioapi_0.15.0 tools_4.3.2 evaluate_0.23
## [17] bslib_0.6.1 yaml_2.3.8 rlang_1.1.3 jsonlite_1.8.8
if ("rtracklayer" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install("rtracklayer")
if ("goseq" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('goseq')
if ("rrvgo" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install("rrvgo")
if ("GO.db" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install("GO.db")
#BiocManager::install("org.Ce.eg.db", force=TRUE) #install if needed
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(goseq)
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
##
library(rtracklayer)
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:geneLenDataBase':
##
## unfactor
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
library(rrvgo)
library(GO.db)
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
library(forcats)
library(simplifyEnrichment)
## Loading required package: grid
##
## ========================================
## simplifyEnrichment version 1.10.0
## Bioconductor page: https://bioconductor.org/packages/simplifyEnrichment/
## Github page: https://github.com/jokergoo/simplifyEnrichment
## Documentation: https://jokergoo.github.io/simplifyEnrichment/
## Examples: https://simplifyenrichment.github.io/
##
## If you use it in published research, please cite:
## Gu, Z. simplifyEnrichment: an R/Bioconductor package for Clustering and
## Visualizing Functional Enrichment Results, Genomics, Proteomics &
## Bioinformatics 2022.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(simplifyEnrichment))
## ========================================
sessionInfo() #list of packages after library-ing these packages
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] simplifyEnrichment_1.10.0 forcats_1.0.0
## [3] tidyr_1.3.1 GO.db_3.17.0
## [5] AnnotationDbi_1.62.2 Biobase_2.60.0
## [7] rrvgo_1.12.2 rtracklayer_1.60.1
## [9] GenomicRanges_1.52.1 GenomeInfoDb_1.36.4
## [11] IRanges_2.34.1 S4Vectors_0.38.2
## [13] BiocGenerics_0.46.0 goseq_1.52.0
## [15] geneLenDataBase_1.36.0 BiasedUrn_2.0.11
## [17] ggplot2_3.4.4 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 shape_1.4.6
## [3] rstudioapi_0.15.0 jsonlite_1.8.8
## [5] umap_0.2.10.0 magrittr_2.0.3
## [7] GenomicFeatures_1.52.2 rmarkdown_2.25
## [9] GlobalOptions_0.1.2 BiocIO_1.10.0
## [11] zlibbioc_1.46.0 vctrs_0.6.5
## [13] memoise_2.0.1 Rsamtools_2.16.0
## [15] RCurl_1.98-1.14 askpass_1.2.0
## [17] htmltools_0.5.7 S4Arrays_1.0.6
## [19] progress_1.2.3 curl_5.2.0
## [21] sass_0.4.8 bslib_0.6.1
## [23] cachem_1.0.8 GenomicAlignments_1.36.0
## [25] igraph_2.0.1.1 iterators_1.0.14
## [27] mime_0.12 lifecycle_1.0.4
## [29] pkgconfig_2.0.3 Matrix_1.6-5
## [31] R6_2.5.1 fastmap_1.1.1
## [33] clue_0.3-65 GenomeInfoDbData_1.2.10
## [35] MatrixGenerics_1.12.3 shiny_1.8.0
## [37] digest_0.6.34 colorspace_2.1-0
## [39] RSpectra_0.16-1 RSQLite_2.3.5
## [41] org.Hs.eg.db_3.17.0 filelock_1.0.3
## [43] fansi_1.0.6 httr_1.4.7
## [45] abind_1.4-5 mgcv_1.9-1
## [47] compiler_4.3.2 doParallel_1.0.17
## [49] bit64_4.0.5 withr_3.0.0
## [51] BiocParallel_1.34.2 DBI_1.2.1
## [53] biomaRt_2.56.1 openssl_2.1.1
## [55] proxyC_0.3.4 rappdirs_0.3.3
## [57] DelayedArray_0.26.7 rjson_0.2.21
## [59] tools_4.3.2 httpuv_1.6.14
## [61] glue_1.7.0 restfulr_0.0.15
## [63] nlme_3.1-164 GOSemSim_2.26.1
## [65] promises_1.2.1 gridBase_0.4-7
## [67] cluster_2.1.6 generics_0.1.3
## [69] gtable_0.3.4 data.table_1.15.0
## [71] hms_1.1.3 xml2_1.3.6
## [73] utf8_1.2.4 XVector_0.40.0
## [75] foreach_1.5.2 ggrepel_0.9.5
## [77] pillar_1.9.0 stringr_1.5.1
## [79] later_1.3.2 circlize_0.4.15
## [81] splines_4.3.2 BiocFileCache_2.8.0
## [83] lattice_0.22-5 bit_4.0.5
## [85] tidyselect_1.2.0 ComplexHeatmap_2.16.0
## [87] tm_0.7-11 Biostrings_2.68.1
## [89] knitr_1.45 NLP_0.2-1
## [91] SummarizedExperiment_1.30.2 xfun_0.42
## [93] matrixStats_1.2.0 pheatmap_1.0.12
## [95] stringi_1.8.3 yaml_2.3.8
## [97] evaluate_0.23 codetools_0.2-19
## [99] wordcloud_2.6 tibble_3.2.1
## [101] cli_3.6.2 RcppParallel_5.1.7
## [103] xtable_1.8-4 reticulate_1.35.0
## [105] munsell_0.5.0 jquerylib_0.1.4
## [107] treemap_2.4-4 Rcpp_1.0.12
## [109] dbplyr_2.4.0 png_0.1-8
## [111] XML_3.99-0.16.1 parallel_4.3.2
## [113] ellipsis_0.3.2 blob_1.2.4
## [115] prettyunits_1.2.0 bitops_1.0-7
## [117] slam_0.1-50 scales_1.3.0
## [119] purrr_1.0.2 crayon_1.5.2
## [121] GetoptLong_1.0.5 rlang_1.1.3
## [123] KEGGREST_1.40.1
DEGs <- read.csv(file="../../../output/glmmseq/signif_genes_normcts.csv", sep=',', header=TRUE) %>% dplyr::select(!c('X'))
#NOTE! This is not a file only with differentially expressed genes, this contains all of the genes in our dataset but also contains p-value information and fold change information to help determine which genes are signficant DEGs based on our model in glmmSeq
rownames(DEGs) <- DEGs$Gene
dim(DEGs)
## [1] 9011 75
Origin_DEGs <- DEGs %>% dplyr::filter(Origin < 0.05)
nrow(Origin_DEGs)
## [1] 840
genes <- rownames(DEGs)
Based off of Ariana’s script
Functional annotation file obtained from: http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz on April 3, 2023.
Pacuta.annot <- read.delim("../../../data/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)
dim(Pacuta.annot)
## [1] 16784 21
head(Pacuta.annot)
## query seed_ortholog evalue
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a 45351.EDO48725 2.16e-120
## 2 Pocillopora_acuta_HIv2___RNAseq.g18333.t1 45351.EDO38694 3.18e-123
## 3 Pocillopora_acuta_HIv2___RNAseq.g7985.t1 192875.XP_004345759.1 1.70e-183
## 4 Pocillopora_acuta_HIv2___RNAseq.g13511.t1 45351.EDO28722 2.94e-48
## 5 Pocillopora_acuta_HIv2___TS.g15308.t1 10224.XP_006813307.1 3.19e-20
## 6 Pocillopora_acuta_HIv2___RNAseq.g2057.t1 106582.XP_004573970.1 3.70e-14
## score
## 1 364.0
## 2 355.0
## 3 526.0
## 4 172.0
## 5 92.4
## 6 68.2
## eggNOG_OGs
## 1 COG0620@1|root,KOG2263@2759|Eukaryota,38GZS@33154|Opisthokonta,3BNKS@33208|Metazoa
## 2 COG0450@1|root,KOG0852@2759|Eukaryota,38B9P@33154|Opisthokonta,3BGS4@33208|Metazoa
## 3 COG3239@1|root,KOG4232@2759|Eukaryota,39UMW@33154|Opisthokonta
## 4 2ED36@1|root,2SITK@2759|Eukaryota,39IIK@33154|Opisthokonta,3C3PY@33208|Metazoa
## 5 COG2801@1|root,KOG0017@2759|Eukaryota,38F42@33154|Opisthokonta,3BA5H@33208|Metazoa
## 6 2CSTD@1|root,2S4A7@2759|Eukaryota,3A79X@33154|Opisthokonta,3BT5E@33208|Metazoa,3D9B1@33213|Bilateria,48FVM@7711|Chordata,49CYZ@7742|Vertebrata,4A886@7898|Actinopterygii
## max_annot_lvl COG_category
## 1 33208|Metazoa E
## 2 33208|Metazoa O
## 3 33154|Opisthokonta I
## 4 33208|Metazoa -
## 5 33208|Metazoa OU
## 6 33208|Metazoa S
## Description Preferred_name
## 1 Cobalamin-independent synthase, Catalytic domain -
## 2 negative regulation of male germ cell proliferation PRDX4
## 3 Fatty acid desaturase -
## 4 - -
## 5 K02A2.6-like -
## 6 Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y PIGY
## GOs
## 1 -
## 2 GO:0000003,GO:0001775,GO:0002252,GO:0002263,GO:0002274,GO:0002275,GO:0002283,GO:0002366,GO:0002376,GO:0002443,GO:0002444,GO:0002446,GO:0003006,GO:0003674,GO:0003824,GO:0004601,GO:0005488,GO:0005515,GO:0005575,GO:0005576,GO:0005615,GO:0005622,GO:0005623,GO:0005737,GO:0005783,GO:0005790,GO:0005829,GO:0006082,GO:0006457,GO:0006464,GO:0006468,GO:0006520,GO:0006575,GO:0006793,GO:0006796,GO:0006807,GO:0006810,GO:0006887,GO:0006915,GO:0006950,GO:0006952,GO:0006955,GO:0006979,GO:0007154,GO:0007165,GO:0007249,GO:0007252,GO:0007275,GO:0007276,GO:0007283,GO:0007548,GO:0008150,GO:0008152,GO:0008219,GO:0008285,GO:0008379,GO:0008406,GO:0008584,GO:0009056,GO:0009266,GO:0009409,GO:0009605,GO:0009607,GO:0009617,GO:0009628,GO:0009636,GO:0009893,GO:0009966,GO:0009967,GO:0009987,GO:0010467,GO:0010604,GO:0010646,GO:0010647,GO:0010941,GO:0010942,GO:0010950,GO:0010952,GO:0012501,GO:0012505,GO:0016043,GO:0016192,GO:0016209,GO:0016310,GO:0016491,GO:0016684,GO:0016999,GO:0017001,GO:0017144,GO:0019222,GO:0019471,GO:0019538,GO:0019725,GO:0019752,GO:0019953,GO:0022414,GO:0022417,GO:0023051,GO:0023052,GO:0023056,GO:0030141,GO:0030162,GO:0030198,GO:0031323,GO:0031325,GO:0031410,GO:0031974,GO:0031982,GO:0031983,GO:0032268,GO:0032270,GO:0032501,GO:0032502,GO:0032504,GO:0032940,GO:0033554,GO:0034774,GO:0035556,GO:0036211,GO:0036230,GO:0042119,GO:0042127,GO:0042221,GO:0042592,GO:0042737,GO:0042742,GO:0042743,GO:0042744,GO:0042802,GO:0042803,GO:0042981,GO:0043062,GO:0043065,GO:0043067,GO:0043068,GO:0043085,GO:0043170,GO:0043207,GO:0043226,GO:0043227,GO:0043229,GO:0043231,GO:0043233,GO:0043280,GO:0043281,GO:0043299,GO:0043312,GO:0043412,GO:0043436,GO:0043900,GO:0043901,GO:0044093,GO:0044237,GO:0044238,GO:0044248,GO:0044260,GO:0044267,GO:0044281,GO:0044421,GO:0044422,GO:0044424,GO:0044433,GO:0044444,GO:0044446,GO:0044464,GO:0044703,GO:0045055,GO:0045137,GO:0045321,GO:0045454,GO:0045862,GO:0046425,GO:0046427,GO:0046546,GO:0046661,GO:0046903,GO:0046983,GO:0048232,GO:0048513,GO:0048518,GO:0048519,GO:0048522,GO:0048523,GO:0048583,GO:0048584,GO:0048608,GO:0048609,GO:0048731,GO:0048856,GO:0050789,GO:0050790,GO:0050794,GO:0050896,GO:0051171,GO:0051173,GO:0051179,GO:0051186,GO:0051187,GO:0051234,GO:0051239,GO:0051241,GO:0051246,GO:0051247,GO:0051336,GO:0051345,GO:0051604,GO:0051704,GO:0051707,GO:0051716,GO:0051920,GO:0052547,GO:0052548,GO:0055114,GO:0060205,GO:0060255,GO:0061458,GO:0065007,GO:0065008,GO:0065009,GO:0070013,GO:0070417,GO:0070887,GO:0071704,GO:0071840,GO:0072593,GO:0080090,GO:0097190,GO:0097237,GO:0097708,GO:0098542,GO:0098754,GO:0098869,GO:0099503,GO:0101002,GO:1901564,GO:1901605,GO:1902531,GO:1902533,GO:1904813,GO:1904892,GO:1904894,GO:1905936,GO:1905937,GO:1990748,GO:2000116,GO:2000241,GO:2000242,GO:2000254,GO:2000255,GO:2001056,GO:2001233,GO:2001235,GO:2001267,GO:2001269
## 3 -
## 4 -
## 5 -
## 6 GO:0000506,GO:0005575,GO:0005622,GO:0005623,GO:0005737,GO:0005783,GO:0005789,GO:0005886,GO:0006464,GO:0006497,GO:0006505,GO:0006506,GO:0006629,GO:0006643,GO:0006644,GO:0006650,GO:0006661,GO:0006664,GO:0006793,GO:0006796,GO:0006807,GO:0008150,GO:0008152,GO:0008610,GO:0008654,GO:0009058,GO:0009059,GO:0009247,GO:0009987,GO:0012505,GO:0016020,GO:0016254,GO:0019538,GO:0019637,GO:0031984,GO:0032991,GO:0034645,GO:0036211,GO:0042157,GO:0042158,GO:0042175,GO:0043170,GO:0043226,GO:0043227,GO:0043229,GO:0043231,GO:0043412,GO:0044237,GO:0044238,GO:0044249,GO:0044255,GO:0044260,GO:0044267,GO:0044422,GO:0044424,GO:0044425,GO:0044432,GO:0044444,GO:0044446,GO:0044464,GO:0045017,GO:0046467,GO:0046474,GO:0046486,GO:0046488,GO:0071704,GO:0071944,GO:0090407,GO:0098796,GO:0098827,GO:1901135,GO:1901137,GO:1901564,GO:1901566,GO:1901576,GO:1902494,GO:1903509,GO:1990234
## EC KEGG_ko
## 1 2.1.1.14 ko:K00549
## 2 1.11.1.15 ko:K03386
## 3 1.14.19.31 ko:K12418
## 4 - -
## 5 - -
## 6 - ko:K11001
## KEGG_Pathway
## 1 ko00270,ko00450,ko01100,ko01110,ko01230,map00270,map00450,map01100,map01110,map01230
## 2 ko04214,map04214
## 3 -
## 4 -
## 5 -
## 6 ko00563,ko01100,map00563,map01100
## KEGG_Module KEGG_Reaction KEGG_rclass
## 1 M00017 R04405,R09365 RC00035,RC00113,RC01241
## 2 - - -
## 3 - - -
## 4 - - -
## 5 - - -
## 6 - R05916 -
## BRITE KEGG_TC CAZy BiGG_Reaction
## 1 ko00000,ko00001,ko00002,ko01000 - - -
## 2 ko00000,ko00001,ko01000,ko04147 - - -
## 3 ko00000,ko01000,ko01004 - - -
## 4 - - - -
## 5 - - - -
## 6 ko00000,ko00001 - - -
## PFAMs
## 1 Meth_synt_2
## 2 1-cysPrx_C,AhpC-TSA
## 3 Cyt-b5,FA_desaturase
## 4 -
## 5 RVT_1,rve
## 6 PIG-Y
genes2annot = match(genes, Pacuta.annot$query) #match genes in DEGs (all genes after filtering) to genes in annotation file
sum(is.na(genes2annot)) #number of genes without EggNog annotation
## [1] 2812
missing<-as.data.frame(genes[which(is.na(genes2annot))]) #dataframe of genes without EggNog annotation
head(missing)
## genes[which(is.na(genes2annot))]
## 1 Pocillopora_acuta_HIv2___RNAseq.g7479.t3a
## 2 Pocillopora_acuta_HIv2___RNAseq.g7479.t3b
## 3 Pocillopora_acuta_HIv2___RNAseq.g682.t1
## 4 Pocillopora_acuta_HIv2___RNAseq.g4805.t1
## 5 Pocillopora_acuta_HIv2___RNAseq.g11061.t1
## 6 Pocillopora_acuta_HIv2___RNAseq.g16217.t1
2812/9011 genes without eggnog annotation
names(Pacuta.annot)
## [1] "query" "seed_ortholog" "evalue" "score"
## [5] "eggNOG_OGs" "max_annot_lvl" "COG_category" "Description"
## [9] "Preferred_name" "GOs" "EC" "KEGG_ko"
## [13] "KEGG_Pathway" "KEGG_Module" "KEGG_Reaction" "KEGG_rclass"
## [17] "BRITE" "KEGG_TC" "CAZy" "BiGG_Reaction"
## [21] "PFAMs"
geneInfo0 = data.frame(gene_id = genes, #add gene id
Accession = Pacuta.annot$seed_ortholog[genes2annot], #add accession number
Bitscore = Pacuta.annot$score[genes2annot], #add bitscore
eValue = Pacuta.annot$evalue[genes2annot], #add e value
Description = Pacuta.annot$Description[genes2annot], #add description of gene
Annotation.GO.ID = Pacuta.annot$GOs[genes2annot], #add GO ID's
q_Origin = DEGs$Origin, #add Origin adjusted p-value
q_Treatment = DEGs$Treatment, #add Treatment adjusted p-value
q_Interaction = DEGs$Treatment.Origin, #add Treatment:Origin adjusted p-value
Stable_OriginFC = DEGs$Stable_OriginFC, #add fold change for Slope vs Flat in the stable treatment
Variable_OriginFC = DEGs$Variable_OriginFC, #add fold change for Slope vs Flat in the variable treatment
maxGroupFC = DEGs$maxGroupFC, #add max group fold change (was FC bigger in stable of variable treatment)
col = DEGs$col) #add qualitative significance info
dim(geneInfo0)
## [1] 9011 13
head(geneInfo0,2)
## gene_id Accession Bitscore eValue
## 1 Pocillopora_acuta_HIv2___RNAseq.g27841.t1 45351.EDO35402 566 4.00e-197
## 2 Pocillopora_acuta_HIv2___RNAseq.g14011.t1 45351.EDO48592 449 1.17e-151
## Description
## 1 cyclin-dependent protein serine/threonine kinase activity
## 2 TAF6-like RNA polymerase II, p300 CBP-associated
## Annotation.GO.ID
## 1 GO:0000003,GO:0003006,GO:0003674,GO:0003824,GO:0004672,GO:0004674,GO:0004693,GO:0005575,GO:0005622,GO:0005623,GO:0005634,GO:0005737,GO:0005813,GO:0005815,GO:0005856,GO:0006464,GO:0006468,GO:0006793,GO:0006796,GO:0006807,GO:0007154,GO:0007165,GO:0007548,GO:0008150,GO:0008152,GO:0009987,GO:0015630,GO:0016301,GO:0016310,GO:0016740,GO:0016772,GO:0016773,GO:0019538,GO:0022414,GO:0023052,GO:0032502,GO:0036211,GO:0043170,GO:0043226,GO:0043227,GO:0043228,GO:0043229,GO:0043231,GO:0043232,GO:0043412,GO:0044237,GO:0044238,GO:0044260,GO:0044267,GO:0044422,GO:0044424,GO:0044430,GO:0044446,GO:0044464,GO:0050789,GO:0050794,GO:0050896,GO:0051716,GO:0051726,GO:0065007,GO:0071704,GO:0097472,GO:0140096,GO:1901564
## 2 GO:0000118,GO:0000123,GO:0003674,GO:0003676,GO:0003677,GO:0003712,GO:0003713,GO:0005488,GO:0005575,GO:0005622,GO:0005623,GO:0005634,GO:0005654,GO:0005730,GO:0006325,GO:0006338,GO:0006355,GO:0006357,GO:0006464,GO:0006473,GO:0006475,GO:0006807,GO:0006996,GO:0008150,GO:0008152,GO:0009889,GO:0009891,GO:0009893,GO:0009987,GO:0010468,GO:0010556,GO:0010557,GO:0010604,GO:0016043,GO:0016569,GO:0016570,GO:0016573,GO:0016604,GO:0016607,GO:0018193,GO:0018205,GO:0018393,GO:0018394,GO:0019219,GO:0019222,GO:0019538,GO:0030914,GO:0031248,GO:0031323,GO:0031325,GO:0031326,GO:0031328,GO:0031974,GO:0031981,GO:0032991,GO:0036211,GO:0043170,GO:0043226,GO:0043227,GO:0043228,GO:0043229,GO:0043231,GO:0043232,GO:0043233,GO:0043412,GO:0043543,GO:0043966,GO:0044237,GO:0044238,GO:0044260,GO:0044267,GO:0044422,GO:0044424,GO:0044428,GO:0044446,GO:0044451,GO:0044464,GO:0045935,GO:0048518,GO:0048522,GO:0050789,GO:0050794,GO:0051171,GO:0051173,GO:0051252,GO:0051254,GO:0051276,GO:0060255,GO:0065007,GO:0070013,GO:0070461,GO:0071704,GO:0071840,GO:0080090,GO:0097159,GO:0140110,GO:1901363,GO:1901564,GO:1902493,GO:1902494,GO:1902680,GO:1903506,GO:1903508,GO:1990234,GO:2000112,GO:2001141
## q_Origin q_Treatment q_Interaction Stable_OriginFC Variable_OriginFC
## 1 0.3325688 1 0.9994279 -0.2177817 -0.1121968
## 2 0.1514413 1 0.9994279 0.6976765 0.2428565
## maxGroupFC col
## 1 Stable Not Significant
## 2 Stable Not Significant
Add KEGG annotation information. Downloaded from: http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz on April 3, 2023.
kegg<-read.table("../../../data/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", sep="", quote="", na.strings=c("","NA"), blank.lines.skip = FALSE, header=FALSE)
dim(kegg)
## [1] 33730 2
head(kegg)
## V1 V2
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a <NA>
## 2 Pocillopora_acuta_HIv2___RNAseq.g24143.t1b K22584
## 3 Pocillopora_acuta_HIv2___RNAseq.g22918.t1 <NA>
## 4 Pocillopora_acuta_HIv2___RNAseq.g18333.t1 K03386
## 5 Pocillopora_acuta_HIv2___RNAseq.g7985.t1 <NA>
## 6 Pocillopora_acuta_HIv2___RNAseq.g13511.t1 <NA>
Add KEGG annotations to each gene.
geneInfo0$KEGG <- kegg$V2[match(geneInfo0$gene_id, kegg$V1)]
sum(is.na(geneInfo0$KEGG)) #number of genes without KEGG annotation
## [1] 3828
missing_KEGG <- as.data.frame(genes[which(is.na(geneInfo0$KEGG))]) #dataframe of genes without EggNog annotation
head(missing_KEGG)
## genes[which(is.na(geneInfo0$KEGG))]
## 1 Pocillopora_acuta_HIv2___RNAseq.g7479.t3a
## 2 Pocillopora_acuta_HIv2___RNAseq.g7479.t3b
## 3 Pocillopora_acuta_HIv2___RNAseq.g682.t1
## 4 Pocillopora_acuta_HIv2___RNAseq.g4805.t1
## 5 Pocillopora_acuta_HIv2___RNAseq.g11061.t1
## 6 Pocillopora_acuta_HIv2___TS.g24370.t1
Order geneInfo0 by significance of Origin, q_Origin (adjusted p value)
geneInfo <- geneInfo0[order(geneInfo0[, 'q_Origin']), ]
write.csv(geneInfo, file = "../../../output/glmmseq/geneInfo.csv") #gene info for reference/supplement
#geneInfo<-read.csv("../../../output/glmmseq/geneInfo.csv")
dim(geneInfo)
## [1] 9011 14
Get gene length information.
#import file
gff <- rtracklayer::import("../../../data/Pocillopora_acuta_HIv2.genes_fixed.gff3") #if this doesn't work, restart R and try again
transcripts <- subset(gff, type == "transcript") #keep only transcripts , not CDS or exons
transcript_lengths <- width(transcripts) #isolate length of each gene
seqnames<-transcripts$ID #extract list of gene id
lengths<-cbind(seqnames, transcript_lengths)
lengths<-as.data.frame(lengths) #convert to data frame
geneInfo$Length<-lengths$transcript_lengths[match(geneInfo$gene_id, lengths$seqnames)] #Add in length to geneInfo
Format GO terms to remove dashes and quotes and separate by semicolons (replace , with ;) in Annotation.GO.ID column
geneInfo$Annotation.GO.ID <- gsub(",", ";", geneInfo$Annotation.GO.ID)
geneInfo$Annotation.GO.ID <- gsub('"', "", geneInfo$Annotation.GO.ID)
geneInfo$Annotation.GO.ID <- gsub("-", NA, geneInfo$Annotation.GO.ID)
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
### Generate vector with names of the 840 significant DEGs by Origin
ID.vector <- geneInfo %>%
filter(q_Origin < 0.05) %>%
pull(gene_id)
##Get a list of GO Terms for the all 9011 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, Annotation.GO.ID)
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$Annotation.GO.ID), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "Annotation.GO.ID")
GO.terms <- split2
dim(GO.terms)
## [1] 705483 2
##Construct list of genes with 1 for genes in that are significant for Origin and 0 for those that are not
gene.vector = as.integer(ALL.vector %in% ID.vector) #since we ordered geneInfo by q_Origin, this puts a "1" for the first 840 genes, which are the same genes in ID.vector
names(gene.vector) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf <- nullp(gene.vector, ID.vector, bias.data = LENGTH.vector)
#how many go terms from the 840 genes
##Get a list of GO Terms for the 840 DE genes
GO.terms_DE <- geneInfo %>% filter(q_Origin < 0.05) %>%
dplyr::select(gene_id, Annotation.GO.ID)
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms_DE$Annotation.GO.ID), ";")
split2 <- data.frame(v1 = rep.int(GO.terms_DE$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "Annotation.GO.ID")
GO.terms_DE <- split2
dim(GO.terms_DE)
## [1] 46011 2
#run goseq using Wallenius method for all categories of GO terms
GO.wall <- goseq(pwf, ID.vector, gene2cat=GO.terms, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Write file of results
write.csv(GO, "../../../output/glmmseq/GOSeq/GOseq_DEG_Origin.csv")
#Filtering for p < 0.05
GO_05 <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
#Filtering for p < 0.05
GO_001 <- GO %>%
dplyr::filter(over_represented_pvalue<0.001) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
dim(GO_05)
## [1] 219 7
head(GO_05)
## GOterm over_represented_pvalue under_represented_pvalue numDEInCat
## 1 GO:0086036 0.0004982007 0.9999834 4
## 2 GO:0045939 0.0013830260 0.9998741 5
## 3 GO:0009151 0.0018918989 0.9999601 3
## 4 GO:0009215 0.0018918989 0.9999601 3
## 5 GO:0032963 0.0019057722 0.9998764 4
## 6 GO:0070268 0.0019565847 0.9998722 4
## numInCat term ontology
## 1 6 regulation of cardiac muscle cell membrane potential BP
## 2 11 negative regulation of steroid metabolic process BP
## 3 4 purine deoxyribonucleotide metabolic process BP
## 4 4 purine deoxyribonucleoside triphosphate metabolic process BP
## 5 8 collagen metabolic process BP
## 6 8 cornification BP
dim(GO_001)
## [1] 2 7
GO_001
## GOterm over_represented_pvalue under_represented_pvalue numDEInCat
## 1 GO:0086036 0.0004982007 0.9999834 4
## 2 GO:0051187 0.0001070347 0.9999878 8
## numInCat term ontology
## 1 6 regulation of cardiac muscle cell membrane potential BP
## 2 19 <NA> <NA>
Plotting!
ontologies<-c("BP", "CC", "MF")
for (category in ontologies) {
go_results <- GO_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$over_represented_pvalue), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
filename <- paste0("../../../output/glmmseq/GOSeq/GOseq_DEG_Origin_filtered_P05_", category, ".csv")
write.csv(go_results, filename)
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=over_represented_pvalue)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot)
ggsave(filename=paste0("../../../output/glmmseq/GOSeq/OriginDEGs_P05", "_", category, ".png"), plot=GO.plot, dpi=300, height=10, units="in", limitsize=FALSE)
}
##
## preparing gene to GO mapping data...
## preparing IC data...
## Saving 7 x 10 in image
## preparing gene to GO mapping data...
##
## preparing IC data...
## Saving 7 x 10 in image
## preparing gene to GO mapping data...
##
## preparing IC data...
## Saving 7 x 10 in image
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- GO_05 %>% dplyr::filter(ontology=="BP") %>% dplyr::select(GOterm) %>% unlist()
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../../output/glmmseq/GOSeq/OriginDEGs_P05_GOSEM.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), method = "kmeans")
## Cluster 149 terms by 'kmeans'... 11 clusters, used 0.07031012 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 11 GO lists...
dev.off()
## quartz_off_screen
## 2
#ordered by p-value
GO_plot_all_pval <- GO_001 %>% drop_na(ontology) %>% mutate(term = fct_reorder(term, over_represented_pvalue)) %>%
mutate(term = fct_reorder(term, ontology)) %>%
ggplot( aes(x=term, y=numDEInCat) ) +
geom_segment( aes(x=term ,xend=term, y=0, yend=numDEInCat), color="grey") +
geom_text(aes(label = over_represented_pvalue), hjust = -1, vjust = 0, size = 2) +
geom_point(size=1, aes(colour = ontology)) +
coord_flip() +
#ylim(0,305) +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="bottom"
) +
xlab("") +
ylab("Number of Differentially Expressed Genes in GO Term") +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
plot.background=element_blank()) #Set the plot background #set title attributes
GO_plot_all_pval
ggsave("../../../output/glmmseq/GOSeq/OriginDEGs_P001_orderedpval.pdf", GO_plot_all_pval)
## Saving 7 x 5 in image
#ordered by p-value
GO_plot_all_pval <- GO_05 %>% drop_na(ontology) %>% mutate(term = fct_reorder(term, over_represented_pvalue)) %>%
mutate(term = fct_reorder(term, ontology)) %>%
ggplot( aes(x=term, y=numDEInCat) ) +
geom_segment( aes(x=term ,xend=term, y=0, yend=numDEInCat), color="grey") +
geom_text(aes(label = over_represented_pvalue), hjust = -1, vjust = 0, size = 2) +
geom_point(size=1, aes(colour = ontology)) +
coord_flip() +
#ylim(0,305) +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="bottom"
) +
xlab("") +
ylab("Number of Differentially Expressed Genes in GO Term") +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
plot.background=element_blank()) #Set the plot background #set title attributes
GO_plot_all_pval
ggsave("../../../output/glmmseq/GOSeq/OriginDEGs_P05_orderedpval.pdf", GO_plot_all_pval, width = 12, height = 24, units = c("in"), dpi=300, limitsize=FALSE)
Plotting with GOplot package
library(GOplot)
## Loading required package: ggdendro
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## Loading required package: RColorBrewer
GO.terms_genes <- GO.terms %>%
group_by(Annotation.GO.ID) %>%
summarise(Genes = paste(gene, collapse = ', ')) %>% rename(Annotation.GO.ID = "GOterm")
GO_gene <- GO_05 %>% left_join(GO.terms_genes) %>% rename(ontology = "category", GOterm = "ID", over_represented_pvalue= "adj_pval")
## Joining with `by = join_by(GOterm)`
geneInfo_GOplot <- geneInfo %>% rename(gene_id = "ID", Stable_OriginFC = "logFC")
circ <- circle_dat(GO_gene, geneInfo_GOplot)
# Generate a simple barplot
GOBar(subset(circ, category == 'BP'))
# Generate the bubble plot with a label threshold of 3
#GOBubble(circ)
##Get a list of KEGG Terms for all 9011 genes
KO.terms <- geneInfo %>%
dplyr::select(gene_id, KEGG)
#run goseq using Wallenius method for all categories of KEGG terms
KO.wall <-
goseq(
pwf,
ID.vector,
gene2cat = KO.terms,
test.cats = c("KEGG"),
method = "Wallenius",
use_genes_without_cat = TRUE
)
## Using manually entered categories.
## Calculating the p-values...
KO <- KO.wall[order(KO.wall$over_represented_pvalue), ]
colnames(KO)[1] <- "KEGGterm"
#Filtering for p < 0.05
KO_05 <- KO %>%
dplyr::filter(over_represented_pvalue < 0.05) %>%
dplyr::arrange(., over_represented_pvalue)
head(KO_05)
## KEGGterm over_represented_pvalue under_represented_pvalue numDEInCat numInCat
## 1 K01052 0.002747628 0.9999336 3 4
## 2 K01539 0.005115559 1.0000000 2 2
## 3 K05658 0.005129986 1.0000000 2 2
## 4 K05619 0.005222246 1.0000000 2 2
## 5 K11789 0.006212082 0.9996514 3 6
## 6 K01363 0.006678076 1.0000000 2 2
dim(KO_05)
## [1] 21 5
#Write file of results
write.csv(KO, "../../../output/glmmseq/GOSeq/GOseq_KEGG_Origin.csv")
Over-represented KEGG terms, p = 0.05: “K01052” “K01539” “K05658” “K05619” “K11789” “K01363” “K13348” “K01345” “K24436” “K17592” “K25493” “K24125” “K12825” “K08059” “K05929” “K19721” “K14480” “K03671” “K06711” “K13443” “K18405”
freq_fig <- ggplot(go_results, aes(y=numInCat,x=reorder(ParentTerm, numInCat)
#, fill=Percentage.of.shared.GO.terms.with.biomineralization.genes,group=1
))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=numInCat)) +
geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,95))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_gradientn(colours=c("white","#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(0, 100))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
freq_fig
ggsave(filename="../../../output/glmmseq/GOSeq/OriginDEGs_P05_BP_numInCat.png", plot=freq_fig, dpi=300, height=10, units="in", limitsize=FALSE)
## Saving 7 x 10 in image
freq_fig <- ggplot(go_results, aes(y=numDEInCat,x=reorder(ParentTerm, numDEInCat)
#, fill=Percentage.of.shared.GO.terms.with.biomineralization.genes,group=1
))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=numDEInCat)) +
geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,20))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_gradientn(colours=c("white","#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(0, 100))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
freq_fig
ggsave(filename="../../../output/glmmseq/GOSeq/OriginDEGs_P05_BP_numDEInCat.png", plot=freq_fig, dpi=300, height=10, units="in", limitsize=FALSE)
## Saving 7 x 10 in image
### Generate vector with names of the 346 significant DEGs by Origin that are upregulated in the Slope origin in both treatments
ID.vector_upSlope <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC > 0 & Variable_OriginFC > 0) %>%
pull(gene_id)
##Construct list of genes with 1 for genes in that are significant for Origin and upregulated in Slope and 0 for those that are not
gene.vector_upSlope = as.integer(ALL.vector %in% ID.vector_upSlope)
names(gene.vector_upSlope) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf_upSlope <- nullp(gene.vector_upSlope, ID.vector_upSlope, bias.data = LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall_upSlope <- goseq(pwf_upSlope, ID.vector_upSlope, gene2cat=GO.terms, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO_upSlope <- GO.wall_upSlope[order(GO.wall_upSlope$over_represented_pvalue),]
colnames(GO_upSlope)[1] <- "GOterm"
#Write file of results
write.csv(GO_upSlope, "../../../output/glmmseq/GOSeq/GOseq_DEG_Origin_upSlope.csv")
#Filtering for p < 0.05
GO_upSlope_05 <- GO_upSlope %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
#Filtering for p < 0.05
GO_upSlope_001 <- GO_upSlope %>%
dplyr::filter(over_represented_pvalue<0.001) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
dim(GO_upSlope_05)
## [1] 422 7
dim(GO_upSlope_001)
## [1] 23 7
There are 422 over-represented GO terms (by p <0.05) in the 346 significant DEGs by Origin that are upregulated in the Slope origin in both treatments
Plotting!
ontologies<-c("BP", "CC", "MF")
for (category in ontologies) {
go_results <- GO_upSlope_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(over_represented_pvalue != "NA") %>%
filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$over_represented_pvalue), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_upslope <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=over_represented_pvalue)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot_upslope)
ggsave(filename=paste0("../../../output/glmmseq/GOSeq/OriginDEGs_upslope_P05", "_", category, ".png"), plot=GO.plot_upslope, dpi=100, width=12, height=48, units="in", limitsize=FALSE)
}
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
### Generate vector with names of the 481 significant DEGs by Origin that are upregulated in the Flat origin in both treatments
ID.vector_upFlat <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC < 0 & Variable_OriginFC < 0) %>%
pull(gene_id)
##Construct list of genes with 1 for genes in that are significant for Origin and upregulated in Flat and 0 for those that are not
gene.vector_upFlat = as.integer(ALL.vector %in% ID.vector_upFlat)
names(gene.vector_upFlat) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf_upFlat <- nullp(gene.vector_upFlat, ID.vector_upFlat, bias.data = LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall_upFlat <- goseq(pwf_upFlat, ID.vector_upFlat, gene2cat=GO.terms, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO_upFlat <- GO.wall_upFlat[order(GO.wall_upFlat$over_represented_pvalue),]
colnames(GO_upFlat)[1] <- "GOterm"
#Write file of results
write.csv(GO_upFlat, "../../../output/glmmseq/GOSeq/GOseq_DEG_Origin_upFlat.csv")
#Filtering for p < 0.05
GO_upFlat_05 <- GO_upFlat %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
#Filtering for p < 0.05
GO_upFlat_001 <- GO_upFlat %>%
dplyr::filter(over_represented_pvalue<0.001) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
dim(GO_upFlat_05)
## [1] 352 7
dim(GO_upFlat_001)
## [1] 15 7
There are 243 over-represented GO terms (by p <0.05) in the 481 significant DEGs by Origin that are upregulated in the Flat origin in both treatments
Plotting!
ontologies<-c("BP", "CC", "MF")
for (category in ontologies) {
go_results <- GO_upFlat_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$over_represented_pvalue), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_upFlat <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=over_represented_pvalue)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot_upFlat)
ggsave(filename=paste0("../../../output/glmmseq/GOSeq/OriginDEGs_upFlat_P05", "_", category, ".png"), plot=GO.plot_upFlat, dpi=100, width=12, height=12, units="in", limitsize=FALSE)
}
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
Stable_OriginFC > 0 & Variable_OriginFC < 0
### Generate vector with names of the 11 significant DEGs by Origin that are upregulated in the Slope origin in only the stable treatment
ID.vector_Slope_upStable_downVariable <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC > 0 & Variable_OriginFC < 0) %>%
pull(gene_id)
##Construct list of genes with 1 for genes in that are significant for Origin and upregulated in Slope and 0 for those that are not
gene.vector_updownSlope = as.integer(ALL.vector %in% ID.vector_Slope_upStable_downVariable)
names(gene.vector_updownSlope) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf_updownSlope <- nullp(gene.vector_updownSlope, ID.vector_Slope_upStable_downVariable, bias.data = LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall_updownSlope <- goseq(pwf_updownSlope, ID.vector_Slope_upStable_downVariable, gene2cat=GO.terms, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO_updownSlope <- GO.wall_updownSlope[order(GO.wall_updownSlope$over_represented_pvalue),]
colnames(GO_updownSlope)[1] <- "GOterm"
#Write file of results
write.csv(GO_updownSlope, "../../../output/glmmseq/GOSeq/GOseq_DEG_Origin_Slope_upStable_downVariable.csv")
#Filtering for p < 0.05
GO_updownSlope_05 <- GO_updownSlope %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
#Filtering for p < 0.001
GO_updownSlope_001 <- GO_updownSlope %>%
dplyr::filter(over_represented_pvalue<0.001) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
dim(GO_updownSlope_05)
## [1] 151 7
dim(GO_updownSlope_001)
## [1] 0 7
There are 151 over-represented GO terms (by p <0.05) in the 11 significant DEGs by Origin that are upregulated in the Slope origin in Stable treatment and downregulated in the variable treatment
Plotting!
ontologies<-c("BP", "CC", "MF")
for (category in ontologies) {
go_results <- GO_updownSlope_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(over_represented_pvalue != "NA") %>%
filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$over_represented_pvalue), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_updownSlope <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=over_represented_pvalue)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot_updownSlope)
ggsave(filename=paste0("../../../output/glmmseq/GOSeq/OriginDEGs_updownSlope_P05", "_", category, ".png"), plot=GO.plot_updownSlope, dpi=100, width=12, height=24, units="in", limitsize=FALSE)
}
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
## preparing gene to GO mapping data...
## preparing IC data...
Stable_OriginFC < 0 & Variable_OriginFC > 0
2 genes: one is unannotated (“Pocillopora_acuta_HIv2___RNAseq.g5107.t1a”) and one is:
Pocillopora_acuta_HIv2___RNAseq.g7761.t1 6087.XP_002155939.2 Belongs to the DHHC palmitoyltransferase family
### Generate vector with names of the 2 significant DEGs by Origin that are upregulated in the Slope origin in both treatments
ID.vector_Slope_downStable_upVariable <- geneInfo %>%
filter(q_Origin < 0.05) %>%
filter(Stable_OriginFC < 0 & Variable_OriginFC > 0) %>%
pull(gene_id)
##Construct list of genes with 1 for genes in that are significant for Origin and upregulated in Slope and 0 for those that are not
gene.vector_downupSlope = as.integer(ALL.vector %in% ID.vector_Slope_downStable_upVariable)
names(gene.vector_downupSlope) <- ALL.vector #set names
#weight gene vector by bias for length of gene
pwf_downupSlope <- nullp(gene.vector_downupSlope, ID.vector_Slope_downStable_upVariable, bias.data = LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall_downupSlope <- goseq(pwf_downupSlope, ID.vector_downupSlope, gene2cat=GO.terms, method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO_downupSlope <- GO.wall_downupSlope[order(GO.wall_downupSlope$over_represented_pvalue),]
colnames(GO_downupSlope)[1] <- "GOterm"
#Write file of results
write.csv(GO_downupSlope, "../../../output/glmmseq/GOSeq/GOseq_DEG_Origin_Slope_downStable_upVariable.csv")
#Filtering for p < 0.05
GO_downupSlope_05 <- GO_downupSlope %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
#Filtering for p < 0.001
GO_downupSlope_001 <- GO_downupSlope %>%
dplyr::filter(over_represented_pvalue<0.001) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
dim(GO_downupSlope_05)
## [1] 11 7
dim(GO_downupSlope_001)
## [1] 0 7
There are 11 over-represented GO terms (by p <0.05) in the 2 significant DEGs by Origin that are downregulated in the Slope origin in Stable treatment and upregulated in the variable treatment
Plotting!
ontologies<-c("BP", "MF")
for (category in ontologies) {
go_results <- GO_downupSlope_05
go_results<-go_results%>%
filter(ontology==category)%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont=category,
method="Rel")
#calculate similarity
scores <- setNames(-log(go_results$over_represented_pvalue), go_results$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
#keep only the goterms from the reduced list
go_results<-go_results%>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results$ParentTerm<-reducedTerms$parentTerm[match(go_results$GOterm, reducedTerms$go)]
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_downupSlope <- ggplot2::ggplot(go_results, aes(x = ontology, y = term)) +
geom_point(aes(size=over_represented_pvalue)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
print(GO.plot_downupSlope)
ggsave(filename=paste0("../../../output/glmmseq/GOSeq/OriginDEGs_downupSlope_P05", "_", category, ".png"), plot=GO.plot_downupSlope)
}
## preparing gene to GO mapping data...
## preparing IC data...
## Saving 7 x 5 in image
## preparing gene to GO mapping data...
##
## preparing IC data...
## Saving 7 x 5 in image